library(tidyverse)
library(readr)
library(purrr)
library(plotly)

Problem 1

import, clean and describe data

  • I created the variable city_state.
homicide = read_csv("homicide-data.csv") |>
  janitor::clean_names() |>
  dplyr::mutate(
    city_state = paste(city,",",state),
    city_state = str_replace(city_state,"Tulsa , AL", "Tulsa , OK")) 
## Rows: 52179 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): uid, victim_last, victim_first, victim_race, victim_age, victim_sex...
## dbl (3): reported_date, lat, lon
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

the raw data have 52179 records and13 variables. Column names are uid, reported_date, victim_last, victim_first, victim_race, victim_age, victim_sex, city, state, lat, lon, disposition, city_state.The data included the location of the killing, whether an arrest was made and, in most cases, basic demographic information about each victim.

summerize and get the proportion

  • I summarized within cities to obtain the total number of homicides and the number of unsolved homicides (those for which the disposition is “Closed without arrest” or “Open/No arrest”).

  • For the city of Baltimore, MD, use the prop.test function to estimate the proportion of homicides that are unsolved; save the output of prop.test as an R object, apply the broom::tidy to this object and pull the estimated proportion and confidence intervals from the resulting tidy dataframe.

p = homicide |>
  group_by(city_state) |># there's no missing in city or state variables
  summarise(total_count = n(),
            unsolved_count = sum(disposition %in% c("Closed without arrest","Open/No arrest"))) |>
  filter(city_state == "Baltimore , MD") 

prop.test(pull(p,unsolved_count), pull(p,total_count),conf.level = .95) |>
  broom::tidy() |>
  select(estimate,conf.low, conf.high)
## # A tibble: 1 × 3
##   estimate conf.low conf.high
##      <dbl>    <dbl>     <dbl>
## 1    0.646    0.628     0.663

get the proportions for all locations

Now run prop.test for each of the cities in your dataset, and extract both the proportion of unsolved homicides and the confidence interval for each. Do this within a “tidy” pipeline, making use of purrr::map, purrr::map2, list columns and unnest as necessary to create a tidy dataframe with estimated proportions and CIs for each city.

I had a function calculating prop and its 95%CI.

prop = function(df){
  
  pp = prop.test(pull(df,unsolved_count), pull(df,total_count),conf.level = .95) |>
    broom::tidy() |>
    select(estimate,conf.low, conf.high)
}

prop(p)#check if the function works

data_city = homicide |>
  group_by(city_state) |># there's no missing in city or state variables
  summarise(total_count = n(),
            unsolved_count = sum(disposition %in% c("Closed without arrest","Open/No arrest"))) 

result <- data_city |>
  group_by(city_state) |>
  nest() |>
  mutate(proportion_test = map(data, prop)) |>
  select(proportion_test) |>
  unnest(proportion_test)
## Adding missing grouping variables: `city_state`
#do I need to combine the two locations in Tulsa

visualization

Create a plot that shows the estimates and CIs for each city – check out geom_errorbar for a way to add error bars based on the upper and lower limits. Organize cities according to the proportion of unsolved homicides.

result |>
  mutate(city_state = fct_reorder(city_state,estimate)) |>
  ggplot(aes(x = city_state, y = estimate, color = city_state)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))

Problem 2

import and clean data

longitudinal = function(path,filename) {
  
  df = 
    read_csv(path) |>
    janitor::clean_names() |>
    mutate(id = filename) |>
    pivot_longer(
      cols = -id,
      names_to = "week",
      values_to = "value",
      names_prefix = "week_") |>
    separate(id,c("arm","subject_id"),sep="_") |>
    mutate(
      arm = recode(
        arm,
        con = "control",
        exp = "experimental"),
      subject_id = gsub("\\.csv$","",subject_id))

  df
  
}

filelst = list.files("./data",full.names = TRUE)

q3_tidy = 
  purrr::map(filelst, ~ longitudinal(.x, basename(.x))) |> 
  bind_rows()
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): week_1, week_2, week_3, week_4, week_5, week_6, week_7, week_8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q3_tidy |>
  group_by(week) |> 
  summarise(n = n())
## # A tibble: 8 × 2
##   week      n
##   <chr> <int>
## 1 1        20
## 2 2        20
## 3 3        20
## 4 4        20
## 5 5        20
## 6 6        20
## 7 7        20
## 8 8        20
q3_tidy |>
  group_by(subject_id) |> 
  summarise(n = n())   
## # A tibble: 10 × 2
##    subject_id     n
##    <chr>      <int>
##  1 01            16
##  2 02            16
##  3 03            16
##  4 04            16
##  5 05            16
##  6 06            16
##  7 07            16
##  8 08            16
##  9 09            16
## 10 10            16
q3_tidy |>
  group_by(week, subject_id) |> 
  summarise(n = n())
## `summarise()` has grouped output by 'week'. You can override using the
## `.groups` argument.
## # A tibble: 80 × 3
## # Groups:   week [8]
##    week  subject_id     n
##    <chr> <chr>      <int>
##  1 1     01             2
##  2 1     02             2
##  3 1     03             2
##  4 1     04             2
##  5 1     05             2
##  6 1     06             2
##  7 1     07             2
##  8 1     08             2
##  9 1     09             2
## 10 1     10             2
## # ℹ 70 more rows

make a spaghetti plot

Make a spaghetti plot showing observations on each subject over time, and comment on differences between groups

q3_tidy |>
  ggplot(aes(x = week, y = value, color = subject_id)) +
  geom_line(aes(group = subject_id),se = FALSE)+
  facet_grid(~arm)
## Warning in geom_line(aes(group = subject_id), se = FALSE): Ignoring unknown
## parameters: `se`

Participants in control arm didn’t show significant change in value over the 8 weeks of study, whereas participants in experimental arm saw a considerable increase in value over the 8 weeks.

Problem 3

mu = 0

set.seed(1)

p3 = function(mu,n = 30,sigma = 5){#mu must be in the first place?

  sim_data = tibble(
     x = rnorm(n, mean = mu, sd = sigma),
  )
    sim_data |>
    t.test() |> broom::tidy() |>
    select(estimate, p.value) |>
    janitor::clean_names()
}

results_df =   
  map(1:5000, \(i) p3(mu = 0)) |> bind_rows()

mu =1:6

sim_results_df = 
  expand_grid(
    mu = 1:6,
    iter = 1:5000) |> 
  mutate(
    estimate_df = map(mu,p3)) |> 
  unnest(estimate_df)

visualization

Make a plot showing the proportion of times the null was rejected (the power of the test) on the y axis and the true value of μ on the x axis. Describe the association between effect size and power.

plot_df = sim_results_df |>
  mutate(
    conclusion = ifelse(p_value < 0.05, "reject","not reject")) |>
  group_by(mu) |>
  summarise(total_count = n(),
            reject = sum(conclusion == "reject")) |>
  mutate(prop = reject/total_count) 

power = plot_df |>
  plot_ly(x = ~mu, y = ~prop, type = 'scatter', mode = "line") |>
  layout(title = 'Association between effect size and power', plot_bgcolor = "#e5ecf6",
         xaxis = list(title = "True mu",
                     tickvals = seq(1, 6, by = 1),
                     ticktext = seq(1, 6, by = 1)),
         yaxis = list(title = "Power")) 
power
  • the power increases as the effect size increases. For a study with a sample size of 30, an effect size=5 has been larger enough to bring the power of t test up to 1.

Make a plot showing the average estimate of μ^ on the y axis and the true value of μ on the x axis.

estimate_all = sim_results_df |>
  plot_ly(x = ~mu, y = ~estimate, type = 'box', name = "all") |>
  layout(title = 'Association between true mu and its estimate', plot_bgcolor = "#e5ecf6",
         xaxis = list(title = "True mu",
                     tickvals = seq(1, 6, by = 1),
                     ticktext = seq(1, 6, by = 1)),
         yaxis = list(title = "Estimate")) 
estimate_all

Make a second plot (or overlay on the first) the average estimate of μ^ only in samples for which the null was rejected on the y axis and the true value of μ on the x axis. Is the sample average of μ^ across tests for which the null is rejected approximately equal to the true value of μ ? Why or why not?

estimate_rej = sim_results_df |>
  filter(p_value<0.05)
 

compare = add_trace(estimate_all, x = ~pull(estimate_rej,mu), y = ~pull(estimate_rej,estimate), type = 'box', name = "reject") |> layout(legend=list(title=list(text='Data Range')))

compare